from numpy import *
import operator
from os import listdir
import matplotlib
import matplotlib.pyplot as plt

# Path of the dating data set file that is handed to file2matrix().
filePath = 'E:\\bigData\\ml\\dataset\\datingTestSet2.txt'

# Core kNN algorithm
def classify0(inX, dataSet, labels, k):
    """Predict the class of one sample by k-nearest-neighbour majority vote.

    The Euclidean distance between ``inX`` and every row of ``dataSet`` is
    computed, the ``k`` closest rows are selected, and the label that occurs
    most often among them is returned as the prediction.

    Args:
        inX: The sample to classify; one value per feature.
        dataSet: The already-classified (training) samples, one row per
            sample. Any array-like accepted by ``asarray`` works.
        labels: Class label of each row of ``dataSet``, indexable by row
            number.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes. When several labels tie, the tied
        label that was met first while walking the neighbours from nearest
        to farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    samples = asarray(dataSet)
    sampleCount = samples.shape[0]
    if not 1 <= k <= sampleCount:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (sampleCount, k))

    # Broadcasting subtracts every training row from inX, so no tiled copy
    # of inX is needed.
    diffMat = asarray(inX) - samples
    # Square, sum per row, take the root: Euclidean distance to each sample.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Row indices of the k smallest distances, nearest first.
    nearest = distances.argsort()[:k]

    classCount = {}
    for rowIndex in nearest:
        voteLabel = labels[rowIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # sorted() is stable, so labels with equal votes keep first-seen order.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Convert a tab-separated data file into a feature matrix and a label list
def file2matrix(filename, numFeatures=3):
    """Parse a tab-separated sample file into features and class labels.

    Every non-blank line must hold at least ``numFeatures`` numeric fields
    followed by further fields, the last of which is the integer class
    label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.
        numFeatures: Number of leading fields stored as features
            (defaults to 3).

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a float array with one
        row per sample and ``numFeatures`` columns, and the list of integer
        labels in file order.

    Raises:
        ValueError: If a line has too few fields or a field cannot be
            converted to a number.
    """
    featureRows = []
    classLabelVector = []
    # One pass inside "with": the file is read once and always closed.
    with open(filename) as fr:
        for lineNumber, line in enumerate(fr, 1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            listFromLine = line.split('\t')
            if len(listFromLine) <= numFeatures:
                raise ValueError(
                    "%s: line %d has %d field(s), expected at least %d"
                    % (filename, lineNumber, len(listFromLine), numFeatures + 1))
            featureRows.append([float(value) for value in listFromLine[:numFeatures]])
            classLabelVector.append(int(listFromLine[-1]))

    returnMat = zeros((len(featureRows), numFeatures))
    if featureRows:
        returnMat[:] = featureRows
    return returnMat, classLabelVector

# Normalise the feature values, mapping every column into the range 0-1
def autoNorm(dataSet):
    """Min-max normalise each column of ``dataSet``.

    Every value becomes ``(value - column minimum) / column range``. A
    column whose values are all equal has range 0 and is mapped to 0
    instead of producing NaN.

    Args:
        dataSet: Samples as rows and features as columns; any array-like
            accepted by ``asarray``.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised data,
        the per-column ``max - min`` (0 for a constant column) and the
        per-column minimum.
    """
    dataSet = asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 rather than 0 so they normalise to 0.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

# Test the kNN classifier on the dating data set
def datingClassTest(hoRatio=0.50, k=3):
    """Measure the classifier's error rate on a held-out slice of the data.

    The data set at ``filePath`` is loaded and normalised. The first
    ``int(m * hoRatio)`` rows are classified one by one against the
    remaining rows, and each prediction, the error rate and the error
    count are printed.

    Args:
        hoRatio: Fraction of the rows used as test samples (defaults to
            0.50, i.e. half of the data).
        k: Number of neighbours passed to ``classify0`` (defaults to 3).

    Raises:
        ValueError: If ``hoRatio`` leaves no test rows or no training rows.
    """
    datingDataMat, datingLabels = file2matrix(filePath)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r splits %d sample(s) into %d test row(s); "
            "both the test and the training part must be non-empty"
            % (hoRatio, m, numTestVecs))

    # The rows after the test slice form the training set for every query.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)


# Interactive classification of one person
def classifyPerson():
    """Ask for one person's three feature values and print the prediction.

    The answers are read from standard input, normalised with the minimum
    and range of the data set at ``filePath``, and classified against the
    whole normalised data set with k = 3.

    Raises:
        ValueError: If an answer cannot be converted to ``float``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("litres of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filePath)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # A constant feature has range 0; divide it by 1 to avoid NaN/inf.
    safeRanges = where(ranges == 0, 1, ranges)
    classifierResult = classify0((inArr - minVals) / safeRanges, normMat, datingLabels, 3)
    # The subtraction of 1 turns the label into an index into resultList.
    print("You will probably like this person:", resultList[classifierResult - 1])


# Data visualisation helper (optional)
def writeData2Picture():
    """Show a scatter plot of the first two feature columns of the data set.

    Column 0 is plotted on the x axis and column 1 on the y axis; both the
    marker size and the marker colour are 15 times the sample's label.
    """
    features, classes = loadingData()
    scaledLabels = 15.0 * array(classes)
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(features[:, 0], features[:, 1], s=scaledLabels, c=scaledLabels)
    plt.show()

# Data loading helper (optional)
def loadingData():
    """Load the dating data set from the module-level ``filePath``.

    Returns:
        The ``(feature matrix, label list)`` pair produced by
        ``file2matrix``.
    """
    # Use the shared constant so the path is defined in one place only.
    return file2matrix(filePath)

# Smoke test for the loading and classification functions; call it from the
# main guard to run it (optional)
def myKNNTest1():
    """Load the data set, print it, and classify the all-zero sample."""
    group, labels = loadingData()
    print(group)  # debug output of the loaded feature matrix
    # The query must have one value per feature column of group; a fixed
    # two-element query cannot be subtracted from a three-column matrix.
    query = zeros(group.shape[1])
    category = classify0(query, group, labels, 3)
    print(category)

# Script entry point: run the interactive classifier when executed directly.
if __name__ == '__main__':
    classifyPerson()
